suppressMessages({
  library(stringr)
  library(tidyverse)
  library(gdata)
  library(base)
  library(scales)
  library(RColorBrewer)
  library(igraph)
  library(qgraph)
  library(ggpubr)
  library(ggrepel)
  library(ggpattern)
  library(purrr)
  library(ergm)
  library(intergraph)
  library(netplot)
  library(broom)
  library(pander)
  library(gtools)
})

Cleaned dataset Format

I. Define functions

1. Open csv dataset and remove blank spaces

# Open csv file and add n_words column  
load_and_n_words <- function(movie_name){
  df_name <- paste0('directed-datasets/', movie_name, '-cl-directed.csv')
  df <- read.csv(df_name, na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8")
  
  for (i in 1:nrow(df)){
    df$nwords[i] <- str_count(df$text[i], '\\w+')
  }
  return(df)
}

# Remove leading and trailing blanks in name and text 
remove_empty_space2 <- function(df){
  df$from <- trim(df$from)
  df$text <- trim(df$text)
  df$directed <- trim(df$directed)
  df$to <- trim(df$to)  
  
  return(df)
}

2. Compute number of words

# 1) Create a df with number of words of ONLY DIRECTED dialogues 
create_df_directed <- function(df){
  df_directed <- df %>% 
    filter(df$directed == 1) %>% 
    group_by(from, to) %>% 
    summarize(weight = sum(nwords))
  
  df_directed <- merge(df_directed, df_0_all_movies[,c("from", "to", "movie")], by = c("from", "to"), all.x = TRUE)
  df_directed <- df_directed[!duplicated(df_directed), ]
  row.names(df_directed) <- 1:nrow(df_directed)
  
  return(df_directed)
}

# 2) Create a df with number of words of ALL dialogues 
create_df_both <- function(df){
  df_both <- df %>% 
    group_by(from) %>% 
    summarize(n_words_total = sum(nwords))

  row.names(df_both) <- 1:nrow(df_both)
  
  return(df_both)
}

3. Create igraph object

###라고 표시한 부분을 수정했음

create_igraph <- function(df_movie){
  # Compute number of words 
  # for (1) directed dialogues and (2) both directed and undirected 
  my_df_dir <- df_movie %>% 
    create_df_directed()
  my_df_both <- df_movie %>% 
    create_df_both()
  
  # Create character matrix (to create adjacency matrix)
  chars <- my_df_both$from
  from <- c()
  to <- c()
  
  for (i in 1:length(chars)){
    from <- append(from, rep(chars[i], times = length(chars)))
    }
  
  to <- append(to, rep(chars, times = length(chars)))
  my_char_mat <- data.frame(from, to) # my char matrix
  remove(chars, from, to) # remove unnecessary objects 
  
  ## Merge n_words_dialogue to each directed dialogue 
  my_char_mat <- merge(my_char_mat, 
                       my_df_dir[,c("from", "to", "weight")],  
                       by = c("from", "to"), 
                       all.x = TRUE)
  ## Remove duplicates
  my_char_mat <- my_char_mat[!duplicated(my_char_mat), ]
  
  ## Merge gender to my_df_both 
  named_char <- read.csv('0-named-char.csv', na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8") %>% 
    rename(from = X.U.FEFF.from)
  my_df_both <- merge(my_df_both, named_char[,c("from", "gender")], by = "from", all.x = TRUE)
  
  
  # Create graph object 
  v <- unique(c(my_char_mat[,1], 
                my_char_mat[,2]))
  v <- na.omit(v)
  e <- na.omit(my_char_mat)
  g <- graph.data.frame(e, vertices = v, directed = T)
  
  # Add attributes 
  V(g)$n_words_total <- my_df_both$n_words_total
  V(g)$gender <- my_df_both$gender

  return(g)
  } # end of function definition 

3.1 Visualize character network

# 모든 영화를 한번에 plot 하기 - 얘만하면 됨 

plot_char_net_rm_isolates <- function(my_network, my_movie){
  # remove isolates
  my_network <- delete_vertices(my_network, degree(my_network) == 0)
  # layout 
  e <- get.edgelist(my_network, names=FALSE)

  if (gorder(my_network) < 18){
    my_layout <- qgraph.layout.fruchtermanreingold(e, vcount=vcount(my_network),
                                                 area=2.2*(vcount(my_network)^1.15), repulse.rad=(vcount(my_network)^1))    
  } else if (gorder(my_network) < 35){
    my_layout <- qgraph.layout.fruchtermanreingold(e, vcount=vcount(my_network),
                                                 area=2.25*(vcount(my_network)^2.1), repulse.rad=(vcount(my_network)^1.5))   
  } else {
    my_layout <- qgraph.layout.fruchtermanreingold(e, vcount=vcount(my_network),
                                                 area=3*(vcount(my_network)^2.1), repulse.rad=(vcount(my_network)^1.5))   
  }

  # plot 
  plot.igraph(my_network,
              vertex.size = rescale(V(my_network)$n_words_total, c(3, 25)),
              vertex.color = adjustcolor(ifelse(V(my_network)$gender == 0, "#009E73", 
                      ifelse(V(my_network)$gender == 1, "#E69F00", "#999999")),
                      alpha.f = .6),
              vertex.label = ifelse(V(my_network)$name %in% named_char_major$from, V(my_network)$name, ""),
              vertex.label.cex = rescale(V(my_network)$n_words_total, c(0.8, 1.6)),
              vertex.label.family = "Cambria",
              vertex.label.font = 2, # bold text 
              vertex.label.color = "black",
              edge.width = rescale(E(my_network)$weight, c(0.05, 10)), ###
              edge.arrow.size = 0.45,
              edge.color = adjustcolor("azure3", alpha.f = .6),
              edge.curved = 0.25, 
              layout = my_layout,
              main = paste0("Character Network for <", my_movie, ">")
              )
  
  legend("topright", legend = c("Woman", "Man"), 
         inset=c(0.05, 0), cex = 0.9, pch = 19, 
         col = c("#009E73", "#E69F00"), title = "Gender" 
       )
}

4. Compute centrality

# Compute centrality 
compute_centrality <- function(my_network, my_movie){
  # In- and out-degree centrality 
  id <- degree(my_network, mode="in")
  od <- degree(my_network, mode = "out")
  # Betweenness
  bet <- betweenness(my_network, weights = E(my_network)$weight)
  # Closeness
  clo <- closeness(my_network, weights = E(my_network)$weight)
  
  # n_words_total
  n_words_total <- V(my_network)$n_words_total
  
  my_df <- data.frame(from=names(id), id=id, od=od, bet=bet, clo=clo, n_words = n_words_total, row.names=NULL)
  my_df$movie <- my_movie
  
  remove(id, od, bet, clo)
  
  return(my_df)
}

II. Apply

Load each transcript (xxxx-cl-directed.csv)

# movie list in alphabetical order 
movie_list <- c("aladdin19", "aladdin", "beauty17", "beauty", "brave", "cind", "frog", "frozen2", "frozen", 
                "incredibles1", "incredibles2", "inside", "mermaid2", "mermaid", "moana", "mulan", "poca", "raya", 
                "sleeping", "snow", "tangled", "toystory1", "toystory2", "toystory3", "toystory4")

for (i in 1:length(movie_list)){
  
  tmp <- load_and_n_words(movie_list[i])
  tmp$movie <- movie_list[i]
  
  assign(paste0("df_3_", movie_list[i]), tmp)
}
remove(tmp)

Merge all preprocessed data of all films

# Merge all preprocessed data of all films 

# movie list in release year order 
movie_list <- c("snow", "cind", "sleeping", "mermaid", "mermaid2", "beauty", "beauty17", "aladdin", "aladdin19",
                "poca", "mulan", "frog", "tangled", "brave", "frozen", "frozen2", "moana", "raya", 
                "toystory1", "toystory2", "toystory3", "toystory4", 
                "incredibles1", "incredibles2", "inside")
df_0_all_movies <- data.frame()
for (i in 1:length(movie_list)){
  df_tmp <- load_and_n_words(movie_list[i])
  df_tmp$movie <- movie_list[i]
  df_0_all_movies <- rbind(df_0_all_movies, df_tmp) %>% 
    remove_empty_space2
}
remove(df_tmp)

2. Calculate number of words

# For each directed dialogue
df_1_dir <- create_df_directed(df_0_all_movies)
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
# For each character 
df_1_both <- create_df_both(df_0_all_movies)
## Warning: Setting row names on a tibble is deprecated.
# Get all unique characters who interacted with another (either from or to)
all_char <- append(df_1_dir$from, df_1_dir$to) %>% unique()
df_1_char <- data.frame(from = all_char)
df_1_char <- merge(df_1_char, df_1_dir[,c("from", "movie")], by = "from") %>% unique() %>% arrange(movie)

# Save to csv 
write.csv(df_1_char,"0_unique_char.csv", row.names = F, fileEncoding = 'utf8')

3. Create igraph object

# 모든 영화를 한번에 graph 만들기 
# Apply 
movie_list <- c("snow", "cind", "sleeping", "mermaid", "mermaid2", "beauty", "beauty17", "aladdin", "aladdin19",
                "poca", "mulan", "frog", "tangled", "brave", "frozen", "frozen2", "moana", "raya", 
                "toystory1", "toystory2", "toystory3", "toystory4", 
                "incredibles1", "incredibles2", "inside", "nemo")

for (i in 1:length(movie_list)){
  tmp <- load_and_n_words(movie_list[i])
  tmp$movie <- movie_list[i]
  
  my_net <- create_igraph(tmp)
  
  assign(paste0("net_", movie_list[i]), my_net)
  remove(tmp, my_net)
}

3.1 Visualize character network

픽사 영화들은 노드 간 거리를 좀 두자

named_char_major <-  read.csv('0-named-char-major2.csv', na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8") %>%
  rename(from = X.U.FEFF.from)

movie_list <- c("snow", "cind", "sleeping", "mermaid", "mermaid2", "beauty", "beauty17", "aladdin", "aladdin19",
                "poca", "mulan", "frog", "tangled", "brave", "frozen", "frozen2", "moana", "raya",
                "toystory1", "toystory2", "toystory3", "toystory4",
                "incredibles1", "incredibles2", "inside", "nemo")
movie_name_list <- c("Snow White and the Seven Dwarfs", "Cinderella", "Sleeping Beauty", "Litte Mermaid", "Little Mermaid 2",
                     "Beauty and the Beast", "Beauty and the Beast (2017)", "Aladdin", "Aladdin (2019)",
                     "Pocahontas", "Mulan", "Princess and the Frog", "Tangled", "Brave", "Frozen", "Frozen 2", "Moana", "Raya and the Last Dragon",
                     "Toy Story 1", "Toy Story 2", "Toy Story 3", "Toy Story 4",
                     "Incredibles", "Incredibles 2", "Inside Out", "Finding Nemo")

plot_list = list()
for (i in 1:length(movie_list)){
  # load each movie 
  tmp <- load_and_n_words(movie_list[i])
  tmp$movie <- movie_list[i]
  
  # create network
  my_net <- create_igraph(tmp)
  # save network object
  assign(paste0("net_", movie_list[i]), my_net)   
  
  # plot
  my_plot <- plot_char_net_rm_isolates(my_net, movie_name_list[i])
  my_plot
  
  # save plots to a list 
  plot_list[[i]] <- my_plot

  remove(tmp, my_net, my_plot)
}

4. Compute centrality

# Apply 
movie_list <- c("snow", "cind", "sleeping", "mermaid", "mermaid2", "beauty", "beauty17", "aladdin", "aladdin19",
                "poca", "mulan", "frog", "tangled", "brave", "frozen", "frozen2", "moana", "raya", 
                "toystory1", "toystory2", "toystory3", "toystory4", 
                "incredibles1", "incredibles2", "inside", "nemo")
year_list <- c(1937, 1950, 1959, 1989, 2000, 1991, 2017, 1992, 2019, 
               1995, 1998, 2009, 2010, 2012, 2013, 2019, 2016, 2021,
               1995, 1999, 2010, 2019, 
               2004, 2018, 2015, 2003)

df_2_centrality <- data.frame()
for (i in 1:length(movie_list)){
  tmp <- load_and_n_words(movie_list[i])
  tmp$movie <- movie_list[i]
  
  my_net <- create_igraph(tmp)
  my_centrality <- compute_centrality(my_net, movie_list[i])
  
  df_2_centrality <- rbind(df_2_centrality, my_centrality)
  remove(tmp, my_net)
}
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
# Get character names 
named_char <- read.csv('0-named-char.csv', na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8") %>% 
  rename(from = X.U.FEFF.from)
char_names <- named_char$from

# Get years 
year <- read.csv('0-year.csv', na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8") %>% 
  rename(movie = X.U.FEFF.movie)

# Merge char names and year 
df_2_centrality <- merge(df_2_centrality, named_char[,c("from", "gender")], by = "from", all.x = TRUE) 

for (i in 1:length(movie_list)){
  df_2_centrality$year[df_2_centrality$movie == movie_list[i]] <- year_list[i]
}
df_2_centrality$year <- as.factor(df_2_centrality$year)

remove(id, od, bet, clo)

Visualize centrality

한 영화씩

Movies with sequels

Toy Story의 경우 line plot으로 각 캐릭터의 centrality가 1-4편으로 갈수록 어떻게 바뀌는지 봐야할듯

All movies

df_2_centrality 만드는 코드

# Number of words - PIXAR 
df <- df_2_centrality %>% 
  drop_na(gender) %>% 
  group_by(movie) %>% 
  mutate(n_words_perc = round(n_words/sum(n_words)*100, 2)) %>% 
  arrange(year, from)

df_2_centrality_cl <- df 

df$pixar <- ifelse(grepl("toystory", df$movie), "Pixar", 
                   ifelse(grepl("incredibles", df$movie), "Pixar", "DisneyPrincess"))

1-1. 각 영화당 값 1개로

여성 1인당 말한 정도 and 남성 1인당 말한 정도 (perc_per)

df_4_movie_name <- read.csv('0_movie_name.csv', na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8") %>% 
  rename(movie = X.U.FEFF.movie) 

# n_words_total aggregated by gender 
df_2_perc_words <- df_2_centrality_cl %>%
  group_by(movie, gender) %>% 
  summarize(perc_subtotal = sum(n_words_perc), 
            pop_subtotal = n()) %>% 
  mutate(perc_per = perc_subtotal/pop_subtotal)
## `summarise()` has grouped output by 'movie'. You can override using the `.groups` argument.
df_2_perc_words <- merge(df_2_perc_words, 
                         df_2_centrality_cl[,c("movie", "year", "gender")], 
                         by = c("movie", "gender"), 
                         all.x = TRUE) %>% unique() 
df_2_perc_words <- merge(df_2_perc_words, 
                         df_4_movie_name[,c("movie", "company", "movie_name")], 
                         by = c("movie"), 
                         all.x = TRUE) 
df <- df_2_perc_words
# df

df_tmp <- df %>% filter(gender == 0)


# Subtotal for all women  
ggplot(df, 
       aes(x=reorder(movie_name, as.numeric(year)), y=perc_subtotal, group = as.factor(gender)), label = df_tmp$movie_name) +
  geom_line(aes(color=as.factor(gender)), cex = 0.45)+
  geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +
  # scale_color_discrete(name = "Gender", labels = c("Woman", "Man")) +
  scale_color_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +
  scale_shape_manual(values=c(16, 3), name = "Company") + 
  ylab("% of number of words spoken") + 
  xlab("Film") + 
  ggtitle("How much do women characters talk?") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
        plot.title = element_text(size=14, face="bold")) 

remove(df_tmp)

# Per person for each gender 
ggplot(df, aes(x=reorder(movie_name, as.numeric(year)), y=perc_per, group = as.factor(gender)), label = df$movie_name) +
  geom_line(aes(color=as.factor(gender)), cex = 0.45)+
  geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +
  # scale_color_discrete(name = "Gender", labels = c("Woman", "Man")) +
  scale_color_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +
  scale_shape_manual(values=c(16, 3), name = "Company", labels = c("Disney Princess", "Pixar")) +
  ylab("% of number of words spoken (per person)") + 
  xlab("Film") + 
  ggtitle("How much do each gender talk per one person?") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
        plot.title = element_text(size=14, face="bold")) 

df <- df_2_perc_words
df_tmp <- df %>% filter(gender == 1) %>% 
  mutate(men_talk = perc_subtotal)

df <- merge(df, df_tmp[,c("movie_name", "men_talk")], by = "movie_name", all.x = TRUE)
df
##                         movie_name        movie gender perc_subtotal
## 1                          Aladdin      aladdin      0         10.41
## 2                          Aladdin      aladdin      1         89.58
## 3                   Aladdin (2019)    aladdin19      0         18.78
## 4                   Aladdin (2019)    aladdin19      1         81.24
## 5             Beauty and the Beast       beauty      0         31.61
## 6             Beauty and the Beast       beauty      1         68.41
## 7      Beauty and the Beast (2017)     beauty17      0         31.40
## 8      Beauty and the Beast (2017)     beauty17      1         68.59
## 9                            Brave        brave      0         73.87
## 10                           Brave        brave      1         26.13
## 11                      Cinderella         cind      0         61.56
## 12                      Cinderella         cind      1         38.45
## 13                    Finding Nemo         nemo      0         23.73
## 14                    Finding Nemo         nemo      1         76.26
## 15                          Frozen       frozen      0         49.03
## 16                          Frozen       frozen      1         50.97
## 17                        Frozen 2      frozen2      0         54.58
## 18                        Frozen 2      frozen2      1         45.43
## 19                   Incredibles 1 incredibles1      0         46.39
## 20                   Incredibles 1 incredibles1      1         53.60
## 21                  Incredibles 2  incredibles2      0         46.84
## 22                  Incredibles 2  incredibles2      1         53.14
## 23                     Inside Out        inside      0         64.23
## 24                     Inside Out        inside      1         35.78
## 25                 Little Mermaid       mermaid      0         34.93
## 26                 Little Mermaid       mermaid      1         65.09
## 27                Little Mermaid 2     mermaid2      0         54.09
## 28                Little Mermaid 2     mermaid2      1         45.92
## 29                           Moana        moana      0         50.36
## 30                           Moana        moana      1         49.65
## 31                           Mulan        mulan      0         26.02
## 32                           Mulan        mulan      1         73.98
## 33                      Pocahontas         poca      0         31.94
## 34                      Pocahontas         poca      1         68.05
## 35        Raya and the Last Dragon         raya      0         77.60
## 36        Raya and the Last Dragon         raya      1         22.39
## 37                Sleeping Beauty      sleeping      0         74.69
## 38                Sleeping Beauty      sleeping      1         25.32
## 39 Snow White and the Seven Dwarfs         snow      0         47.41
## 40 Snow White and the Seven Dwarfs         snow      1         52.59
## 41                         Tangled      tangled      0         54.38
## 42                         Tangled      tangled      1         45.62
## 43       The Princess and the Frog         frog      0         41.47
## 44       The Princess and the Frog         frog      1         58.54
## 45                     Toy Story 1    toystory1      0          8.61
## 46                     Toy Story 1    toystory1      1         91.41
## 47                     Toy Story 2    toystory2      0         12.43
## 48                     Toy Story 2    toystory2      1         87.57
## 49                     Toy Story 3    toystory3      0         22.47
## 50                     Toy Story 3    toystory3      1         77.50
## 51                     Toy Story 4    toystory4      0         40.93
## 52                     Toy Story 4    toystory4      1         59.06
##    pop_subtotal  perc_per year        company men_talk
## 1             3  3.470000 1992 DisneyPrincess    89.58
## 2            19  4.714737 1992 DisneyPrincess    89.58
## 3             6  3.130000 2019 DisneyPrincess    81.24
## 4            15  5.416000 2019 DisneyPrincess    81.24
## 5             7  4.515714 1991 DisneyPrincess    68.41
## 6            16  4.275625 1991 DisneyPrincess    68.41
## 7            14  2.242857 2017 DisneyPrincess    68.59
## 8            18  3.810556 2017 DisneyPrincess    68.59
## 9             6 12.311667 2012 DisneyPrincess    26.13
## 10            8  3.266250 2012 DisneyPrincess    26.13
## 11            6 10.260000 1950 DisneyPrincess    38.45
## 12            7  5.492857 1950 DisneyPrincess    38.45
## 13            7  3.390000 2003          Pixar    76.26
## 14           20  3.813000 2003          Pixar    76.26
## 15            5  9.806000 2013 DisneyPrincess    50.97
## 16           22  2.316818 2013 DisneyPrincess    50.97
## 17            7  7.797143 2019 DisneyPrincess    45.43
## 18           15  3.028667 2019 DisneyPrincess    45.43
## 19           13  3.568462 2004          Pixar    53.60
## 20           23  2.330435 2004          Pixar    53.60
## 21            8  5.855000 2018          Pixar    53.14
## 22           18  2.952222 2018          Pixar    53.14
## 23            8  8.028750 2015          Pixar    35.78
## 24           10  3.578000 2015          Pixar    35.78
## 25            8  4.366250 1989 DisneyPrincess    65.09
## 26           12  5.424167 1989 DisneyPrincess    65.09
## 27            8  6.761250 2000 DisneyPrincess    45.92
## 28           18  2.551111 2000 DisneyPrincess    45.92
## 29            5 10.072000 2016 DisneyPrincess    49.65
## 30            7  7.092857 2016 DisneyPrincess    49.65
## 31            9  2.891111 1998 DisneyPrincess    73.98
## 32           21  3.522857 1998 DisneyPrincess    73.98
## 33            3 10.646667 1995 DisneyPrincess    68.05
## 34           12  5.670833 1995 DisneyPrincess    68.05
## 35           11  7.054545 2021 DisneyPrincess    22.39
## 36            7  3.198571 2021 DisneyPrincess    22.39
## 37            5 14.938000 1959 DisneyPrincess    25.32
## 38            5  5.064000 1959 DisneyPrincess    25.32
## 39            2 23.705000 1937 DisneyPrincess    52.59
## 40           10  5.259000 1937 DisneyPrincess    52.59
## 41            2 27.190000 2010 DisneyPrincess    45.62
## 42           11  4.147273 2010 DisneyPrincess    45.62
## 43            5  8.294000 2009 DisneyPrincess    58.54
## 44           20  2.927000 2009 DisneyPrincess    58.54
## 45            4  2.152500 1995          Pixar    91.41
## 46           13  7.031538 1995          Pixar    91.41
## 47           10  1.243000 1999          Pixar    87.57
## 48           19  4.608947 1999          Pixar    87.57
## 49           12  1.872500 2010          Pixar    77.50
## 50           28  2.767857 2010          Pixar    77.50
## 51           26  1.574231 2019          Pixar    59.06
## 52           25  2.362400 2019          Pixar    59.06
# Subtotal for all women  
ggplot(df, 
       aes(x=reorder(movie_name, as.numeric(men_talk)), 
           y=perc_subtotal, 
           fill = as.factor(gender),
           pattern= company)) +
  geom_bar_pattern(position = "fill", stat = "identity", width = 0.5,
                   color = "black", 
                   pattern_fill = "darkgrey",
                   pattern_angle = 45,
                   pattern_density = 0.03,
                   pattern_spacing = 0.04,
                   pattern_key_scale_factor = 0.6,
                   pattern_alpha = 0.6,
                   alpha = 0.7) + 
  # geom_line(aes(color=as.factor(gender)), cex = 0.45)+
  # geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +
  # scale_fill_discrete(name = "Gender", labels = c("Woman", "Man")) +
  scale_fill_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +
  # scale_shape_manual(values=c(16, 3), name = "Company") + 
  scale_pattern_manual(values = c(DisneyPrincess = "none", Pixar = "stripe"), name = "Company",
                       labels = c("Disney Princess", "Pixar")) + 
  ylab("Percent of number of words spoken") + 
  scale_y_continuous(expand = c(0, 0)) +
  xlab("Film") + 
  ggtitle("How much do women characters talk?") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
        plot.title = element_text(size=14, face="bold")) +
  coord_flip()

* https://stackoverflow.com/questions/22945651/remove-space-between-plotted-data-and-the-axes *

sequels and live-actions

# centrality aggregated by gender 
df_2_cent_agg <- df_2_centrality_cl %>%
  group_by(movie, gender) %>% 
  summarize(id_subtotal = sum(id),
            od_subtotal = sum(od),
            bet_subtotal = sum(bet), 
            clo_subtotal = sum(clo),
            pop_subtotal = n()) %>% 
  mutate(id_avg = id_subtotal/pop_subtotal,
         od_avg = od_subtotal/pop_subtotal, 
         bet_avg = bet_subtotal/pop_subtotal,
         clo_avg = clo_subtotal/pop_subtotal)
## `summarise()` has grouped output by 'movie'. You can override using the `.groups` argument.
df_2_cent_agg <- merge(df_2_cent_agg, 
                         df_2_centrality_cl[,c("movie", "year", "gender")], 
                         by = c("movie", "gender"), 
                         all.x = TRUE) %>% unique() 
df_2_cent_agg <- merge(df_2_cent_agg, 
                         df_4_movie_name[,c("movie", "company", "movie_name")], 
                         by = c("movie"), 
                         all.x = TRUE) 
df <- df_2_cent_agg

df
##           movie gender id_subtotal od_subtotal bet_subtotal clo_subtotal
## 1       aladdin      0          10           9         92.0  0.001994976
## 2       aladdin      1          47          48        490.0  0.011986981
## 3     aladdin19      0          15          16        155.0  0.002771864
## 4     aladdin19      1          45          39        529.0  0.007879078
## 5        beauty      0          23          23        244.0  0.001767533
## 6        beauty      1          49          47        378.0  0.004263861
## 7      beauty17      0          35          35        716.5  0.004169734
## 8      beauty17      1          53          49       1139.5  0.005184970
## 9         brave      0          20          15        107.0  0.005017272
## 10        brave      1          15          18         83.0  0.009916703
## 11         cind      0          29          31        244.0  0.004760541
## 12         cind      1          19          12        111.0  0.004458738
## 13         frog      0          35          30        906.5  0.002058810
## 14         frog      1          46          47        778.0  0.007658809
## 15       frozen      0          31          21        525.5  0.001171757
## 16       frozen      1          49          49       1067.5  0.005063107
## 17      frozen2      0          27          28        347.0  0.008796414
## 18      frozen2      1          33          32        406.5  0.017508230
## 19 incredibles1      0          33          32       1128.0  0.002764981
## 20 incredibles1      1          61          58       2655.0  0.005063317
## 21 incredibles2      0          45          41       1033.0  0.002643835
## 22 incredibles2      1          61          57       1418.0  0.005820550
## 23       inside      0          34          26        402.0  0.002125551
## 24       inside      1          19          17        186.0  0.002562292
## 25      mermaid      0          18          17        197.0  0.006987713
## 26      mermaid      1          33          34        360.0  0.007231591
## 27     mermaid2      0          35          32        421.0  0.003025539
## 28     mermaid2      1          35          36        534.5  0.006583553
## 29        moana      0          15          14        102.0  0.004032799
## 30        moana      1          11          11         75.0  0.006343107
## 31        mulan      0          26          26        478.0  0.003427057
## 32        mulan      1          58          55       1101.0  0.008415391
## 33         nemo      0          23          22        394.0  0.001623869
## 34         nemo      1          96          81       2760.5  0.004837884
## 35         poca      0          10          11         55.0  0.003303143
## 36         poca      1          38          37        316.0  0.012786503
## 37         raya      0          31          29        364.0  0.008780085
## 38         raya      1          11          13         53.0  0.005895338
## 39     sleeping      0          20          23        126.0  0.014473174
## 40     sleeping      1          12           8          8.0  0.008620217
## 41         snow      0          13          13        102.0  0.003875765
## 42         snow      1          32          29        120.5  0.018322089
## 43      tangled      0           7           7         16.0  0.004206349
## 44      tangled      1          21          21         93.0  0.022173783
## 45    toystory1      0           6           9         15.0  0.001819299
## 46    toystory1      1          46          42        346.5  0.006604050
## 47    toystory2      0          13          16        296.0  0.002975672
## 48    toystory2      1          81          70       1374.0  0.005619041
## 49    toystory3      0          34          36        376.0  0.004044206
## 50    toystory3      1         100          98       2739.5  0.009741828
## 51    toystory4      0          74          64       2724.5  0.006052527
## 52    toystory4      1          77          82       2734.0  0.005991913
##    pop_subtotal   id_avg   od_avg    bet_avg      clo_avg year        company
## 1             3 3.333333 3.000000  30.666667 0.0006649919 1992 DisneyPrincess
## 2            19 2.473684 2.526316  25.789474 0.0006308937 1992 DisneyPrincess
## 3             6 2.500000 2.666667  25.833333 0.0004619773 2019 DisneyPrincess
## 4            15 3.000000 2.600000  35.266667 0.0005252719 2019 DisneyPrincess
## 5             7 3.285714 3.285714  34.857143 0.0002525047 1991 DisneyPrincess
## 6            16 3.062500 2.937500  23.625000 0.0002664913 1991 DisneyPrincess
## 7            14 2.500000 2.500000  51.178571 0.0002978382 2017 DisneyPrincess
## 8            18 2.944444 2.722222  63.305556 0.0002880539 2017 DisneyPrincess
## 9             6 3.333333 2.500000  17.833333 0.0008362120 2012 DisneyPrincess
## 10            8 1.875000 2.250000  10.375000 0.0012395879 2012 DisneyPrincess
## 11            6 4.833333 5.166667  40.666667 0.0007934234 1950 DisneyPrincess
## 12            7 2.714286 1.714286  15.857143 0.0006369625 1950 DisneyPrincess
## 13            5 7.000000 6.000000 181.300000 0.0004117621 2009 DisneyPrincess
## 14           20 2.300000 2.350000  38.900000 0.0003829405 2009 DisneyPrincess
## 15            5 6.200000 4.200000 105.100000 0.0002343515 2013 DisneyPrincess
## 16           22 2.227273 2.227273  48.522727 0.0002301412 2013 DisneyPrincess
## 17            7 3.857143 4.000000  49.571429 0.0012566305 2019 DisneyPrincess
## 18           15 2.200000 2.133333  27.100000 0.0011672154 2019 DisneyPrincess
## 19           13 2.538462 2.461538  86.769231 0.0002126909 2004          Pixar
## 20           23 2.652174 2.521739 115.434783 0.0002201442 2004          Pixar
## 21            8 5.625000 5.125000 129.125000 0.0003304794 2018          Pixar
## 22           18 3.388889 3.166667  78.777778 0.0003233639 2018          Pixar
## 23            8 4.250000 3.250000  50.250000 0.0002656939 2015          Pixar
## 24           10 1.900000 1.700000  18.600000 0.0002562292 2015          Pixar
## 25            8 2.250000 2.125000  24.625000 0.0008734641 1989 DisneyPrincess
## 26           12 2.750000 2.833333  30.000000 0.0006026326 1989 DisneyPrincess
## 27            8 4.375000 4.000000  52.625000 0.0003781924 2000 DisneyPrincess
## 28           18 1.944444 2.000000  29.694444 0.0003657530 2000 DisneyPrincess
## 29            5 3.000000 2.800000  20.400000 0.0008065599 2016 DisneyPrincess
## 30            7 1.571429 1.571429  10.714286 0.0009061581 2016 DisneyPrincess
## 31            9 2.888889 2.888889  53.111111 0.0003807842 1998 DisneyPrincess
## 32           21 2.761905 2.619048  52.428571 0.0004007329 1998 DisneyPrincess
## 33            7 3.285714 3.142857  56.285714 0.0002319813 2003          Pixar
## 34           20 4.800000 4.050000 138.025000 0.0002418942 2003          Pixar
## 35            3 3.333333 3.666667  18.333333 0.0011010478 1995 DisneyPrincess
## 36           12 3.166667 3.083333  26.333333 0.0010655419 1995 DisneyPrincess
## 37           11 2.818182 2.636364  33.090909 0.0007981896 2021 DisneyPrincess
## 38            7 1.571429 1.857143   7.571429 0.0008421911 2021 DisneyPrincess
## 39            5 4.000000 4.600000  25.200000 0.0028946347 1959 DisneyPrincess
## 40            5 2.400000 1.600000   1.600000 0.0017240434 1959 DisneyPrincess
## 41            2 6.500000 6.500000  51.000000 0.0019378827 1937 DisneyPrincess
## 42           10 3.200000 2.900000  12.050000 0.0018322089 1937 DisneyPrincess
## 43            2 3.500000 3.500000   8.000000 0.0021031746 2010 DisneyPrincess
## 44           11 1.909091 1.909091   8.454545 0.0020157984 2010 DisneyPrincess
## 45            4 1.500000 2.250000   3.750000 0.0004548246 1995          Pixar
## 46           13 3.538462 3.230769  26.653846 0.0005080038 1995          Pixar
## 47           10 1.300000 1.600000  29.600000 0.0002975672 1999          Pixar
## 48           19 4.263158 3.684211  72.315789 0.0002957390 1999          Pixar
## 49           12 2.833333 3.000000  31.333333 0.0003370172 2010          Pixar
## 50           28 3.571429 3.500000  97.839286 0.0003479224 2010          Pixar
## 51           26 2.846154 2.461538 104.788462 0.0002327895 2019          Pixar
## 52           25 3.080000 3.280000 109.360000 0.0002396765 2019          Pixar
##                         movie_name
## 1                          Aladdin
## 2                          Aladdin
## 3                   Aladdin (2019)
## 4                   Aladdin (2019)
## 5             Beauty and the Beast
## 6             Beauty and the Beast
## 7      Beauty and the Beast (2017)
## 8      Beauty and the Beast (2017)
## 9                            Brave
## 10                           Brave
## 11                      Cinderella
## 12                      Cinderella
## 13       The Princess and the Frog
## 14       The Princess and the Frog
## 15                          Frozen
## 16                          Frozen
## 17                        Frozen 2
## 18                        Frozen 2
## 19                   Incredibles 1
## 20                   Incredibles 1
## 21                  Incredibles 2 
## 22                  Incredibles 2 
## 23                     Inside Out 
## 24                     Inside Out 
## 25                 Little Mermaid 
## 26                 Little Mermaid 
## 27                Little Mermaid 2
## 28                Little Mermaid 2
## 29                           Moana
## 30                           Moana
## 31                           Mulan
## 32                           Mulan
## 33                    Finding Nemo
## 34                    Finding Nemo
## 35                      Pocahontas
## 36                      Pocahontas
## 37        Raya and the Last Dragon
## 38        Raya and the Last Dragon
## 39                Sleeping Beauty 
## 40                Sleeping Beauty 
## 41 Snow White and the Seven Dwarfs
## 42 Snow White and the Seven Dwarfs
## 43                         Tangled
## 44                         Tangled
## 45                     Toy Story 1
## 46                     Toy Story 1
## 47                     Toy Story 2
## 48                     Toy Story 2
## 49                     Toy Story 3
## 50                     Toy Story 3
## 51                     Toy Story 4
## 52                     Toy Story 4
# centrality measures 

centrality_lineplot <- function(y, ylab, title, subtitle){
  p <- ggplot(df, 
       aes(x=reorder(movie_name, as.numeric(year)), y=y, group = as.factor(gender)), label = df$movie_name) +
  geom_line(aes(color=as.factor(gender)), cex = 0.45)+
  geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +

  scale_color_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +  
  # scale_color_discrete(name = "Gender", labels = c("Woman", "Man")) +
  scale_shape_manual(values=c(16, 3), name = "Company") + 
  ylab("") + 
  xlab("Film") + 
  labs(title = title,
       subtitle = subtitle) + 
  theme_minimal() + 
  theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
        plot.title = element_text(size = 13, face="bold")) 
  
  return(p)
}
centrality_lineplot(df$id_subtotal, "In-degree centrality (subtotal)", "Are women spoken to more characters than men?", "Indegree centrality (subtotal)")

centrality_lineplot(df$id_avg, "In-degree centrality (average)", "Is one woman spoken to more characters than one man?", "Indegree centrality (average)")

centrality_lineplot(df$od_subtotal, "Out-degree centrality (subtotal)", "Do women speak to more characters than men?", "Outdegree centrality (subtotal)")

centrality_lineplot(df$od_avg, "Out-degree centrality (average)", "Does one woman speak to more characters than one man?", "Outdegree centrality (average)")

centrality_lineplot(df$bet_subtotal, "Betweenness centrality (subtotal)", "Are women acting like a bridge more than men?", "Betweenness centrality (subtotal)")

centrality_lineplot(df$bet_avg, "Betweenness centrality (average)", "Does one woman act like a bridge more than one man?", "Betweenness centrality (average)")

centrality_lineplot(df$clo_subtotal, "Closeness centrality (subtotal)", "Are women in the center of the network?", "Closeness centrality (subtotal)")

centrality_lineplot(df$clo_avg, "Closeness centrality (average)", "Is one woman in the center more than one man?", "Closeness centrality (average)")

2-1. sequels을 facet으로 묶어보기

ts <- c("toystory1", "toystory2", "toystory3", "toystory4")
inc <- c("incredibles1", "incredibles2")
mer <- c("mermaid", "mermaid2") 
ala <- c("aladdin", "aladdin19")
bea <- c("beauty", "beauty17")
fro <- c("frozen", "frozen2")

seq <- c("toystory1", "toystory2", "toystory3", "toystory4", "incredibles1", "incredibles2", 
         "mermaid", "mermaid2", "aladdin", "aladdin19", "beauty", "beauty17", "frozen", "frozen2")

df_tmp <- df_2_perc_words %>% 
  filter(movie %in% seq) %>% 
  mutate(series = ifelse(movie %in% ts, "Toy Story", 
                         ifelse(movie %in% inc, "Incredibles", 
                                ifelse(movie %in% mer, "Little Mermaid", 
                                       ifelse(movie %in% ala, "Aladdin", 
                                              ifelse(movie %in% bea, "Beauty and the Beast", "Frozen"))))))
# Subtotal for all women  
ggplot(df_tmp, 
       aes(x=movie_name, y=perc_subtotal, group = as.factor(gender)), label = df_tmp$movie_name) +
  geom_line(aes(color=as.factor(gender)), cex = 0.45)+
  geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +
  # scale_color_discrete(name = "Gender", labels = c("Woman", "Man")) +
  scale_color_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +
  scale_shape_manual(values=c(16, 3), name = "Company") + 
  ylab("% of number of words spoken") + 
  xlab("Film") + 
  ggtitle("How much do women characters talk?") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
        plot.title = element_text(size=14, face="bold"),
        strip.text = element_blank()) +
  facet_grid(. ~ series, scales = "free")

# Per person for each gender 
ggplot(df_tmp, 
       aes(x=movie_name, y=perc_per, group = as.factor(gender)), label = df_tmp$movie_name) +
  geom_line(aes(color=as.factor(gender)), cex = 0.45)+
  geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +
  # scale_color_discrete(name = "Gender", labels = c("Woman", "Man")) +
  scale_color_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +
  scale_shape_manual(values=c(16, 3), name = "Company") +
  ylab("% of number of words spoken (per person)") + 
  xlab("Film") + 
  ggtitle("How much do each gender talk per one person?") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
        plot.title = element_text(size=14, face="bold"),
        strip.text = element_blank()) +
  facet_grid(. ~ series, scales = "free")

remove(df_tmp)
centrality_lineplot_seq <- function(y, ylab, title, subtitle){
  p <- ggplot(df, 
       aes(x=reorder(movie_name, as.numeric(year)), y=y, group = as.factor(gender)), label = df$movie_name) +
  geom_line(aes(color=as.factor(gender)), cex = 0.45)+
  geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +

  scale_color_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +  
  # scale_color_discrete(name = "Gender", labels = c("Woman", "Man")) +
  scale_shape_manual(values=c(16, 3), name = "Company") + 
  ylab("") + 
  xlab("Film") + 
  labs(title = title,
       subtitle = subtitle) + 
  theme_minimal() + 
  theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
        plot.title = element_text(size = 13, face="bold")) +
  facet_grid(. ~ series, scales = "free")
  
  return(p)
}

5. ERGM

# 1) Compute Degree Centrality (defined by Park et al., 2011) -> X
unique_char <- unique(df_1_dir$from)

tmp_out <-  df_1_dir%>% 
  group_by(movie, from) %>% 
  summarize(out_weight = sum(weight)) %>% 
  rename(char_name = from)
## `summarise()` has grouped output by 'movie'. You can override using the `.groups` argument.
tmp_in <- df_1_dir %>% 
  group_by(movie, to) %>% 
  summarize(in_weight = sum(weight)) %>% 
  rename(char_name = to)
## `summarise()` has grouped output by 'movie'. You can override using the `.groups` argument.
tmp <- merge(tmp_out, tmp_in, by = c("char_name", "movie"), all.y = TRUE) %>% 
  na.omit()
tmp$boonja <- tmp$out_weight + tmp$in_weight
tmp$multiplied <- tmp$out_weight*tmp$in_weight
boonmo <- 2*sum(tmp$multiplied)
tmp$dc <- tmp$boonja/boonmo

df_4_dc <- tmp
remove(tmp, tmp_in, tmp_out)
files <- list.files(path = "4_ergm_models/")

df_5_ergm <- data.frame()
for (i in 1:length(files)) {
  df_tmp = read.csv(paste0("4_ergm_models/", files[i]), header = T, sep = ",")
  df_tmp = subset(df_tmp, select = -c(std.error, mcmc.error, statistic, p.value))

  df_5_ergm <- rbind(df_5_ergm, df_tmp)
}
write.csv(df_5_ergm, file.path("4_ergm_models_edited/0_ergm_models.csv"))
# Apply 
movie_list <- c("snow", "cind", "sleeping", "mermaid", "mermaid2", "beauty", "beauty17", "aladdin", "aladdin19",
                "poca", "mulan", "frog", "tangled", "brave", "frozen", "frozen2", "moana", "raya", 
                "toystory1", "toystory2", "toystory3", "toystory4", 
                "incredibles1", "incredibles2", "inside", "nemo")

# rbind all films into one dataframe for each model 
df_5_m1 <- data.frame()
df_5_m2 <- data.frame()
df_5_m3 <- data.frame()
df_5_m4 <- data.frame()

for (i in 1:4){
  df_tmp1 <- data.frame()
  
  for (j in 1:length(movie_list)){
    df_tmp2 <- read.csv(paste0("4_ergm_models/m", i, "_", movie_list[j], ".csv"), 
                       na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8")
    df_tmp2$film <- ""
    df_tmp2$film[1] <- movie_list[j]
    df_tmp2 <- df_tmp2 %>% 
      select(film, everything())
    
    df_tmp1 <- rbind(df_tmp1, df_tmp2)
    
  }
  if (i == 1){
    df_5_m1 <- df_tmp1
  }
  if (i == 2){
    df_5_m2 <- df_tmp1
  }
  if (i == 3){
    df_5_m3 <- df_tmp1
  }
  if (i == 4){
    df_5_m4 <- df_tmp1
  }
}